# Exploratory analysis of Uber pickups in New York City (April-September 2014).
#
# This file is a notebook export. `# %%` marks what appear to be the original
# cell boundaries, so a bare expression that ends a cell (e.g. `df.head()`)
# is the value the notebook displayed for that cell.

# %%
import os

import chart_studio.plotly as py
import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
import seaborn as sns
from folium.plugins import HeatMap
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# %%
# The path must be a raw string: in a normal string literal the "\u" in
# "\uber-..." starts a unicode escape and the file does not even parse
# ("\b" would also turn into a backspace). Every path below is built from
# this one constant.
DATA_DIR = r'F:\Data Science projects\by_other\uber-pickups-in-new-york-city'

uber_apr14 = pd.read_csv(
    os.path.join(DATA_DIR, 'uber-raw-data-apr14.csv'), encoding='utf-8'
)

# %%
# os.listdir() returns names in arbitrary order, so sort before slicing to
# make "the last seven entries" reproducible.
# NOTE(review): the slice assumes the uber-raw-data-* files sort last in this
# directory -- confirm against the actual folder contents.
files = sorted(os.listdir(DATA_DIR))[-7:]
files

# %%
# The 'janjune-15' file is excluded from this analysis.
files.remove('uber-raw-data-janjune-15.csv')

# %%
files

# %%
path = DATA_DIR

# Read every file, then concatenate once: calling pd.concat inside the loop
# re-copies all previously loaded rows on each iteration.
frames = [
    pd.read_csv(os.path.join(path, name), encoding='utf-8') for name in files
]
# Files are stacked in reverse read order (last file first). The per-month
# plots below iterate df['month'].unique(), which follows this row order.
final = pd.concat(frames[::-1]) if frames else pd.DataFrame()

# %%
final.shape

# %%
# Columns:
#   Lat  : the latitude of the Uber pickup
#   Lon  : the longitude of the Uber pickup
#   Base : the TLC base company code affiliated with the Uber pickup

# %%
# Background on coordinates:
# The globe is divided by imaginary lines: 360 sections running from top to
# bottom (north to south), called longitude, and 180 sections running from
# side to side (west to east), called latitude.
# Latitude is the measurement of distance north or south of the Equator.
# Every location on Earth has a global address. Because the address is
# numeric, people can communicate about a location no matter what language
# they speak. A global address is given as two numbers called coordinates:
# the location's latitude and its longitude ("Lat/Long").

# %%
df = final.copy()

# %%
df.head()

# %%
df.shape

# %%
df.dtypes

# %%
df['Date/Time'] = pd.to_datetime(df['Date/Time'], format="%m/%d/%Y %H:%M:%S")

# %%
df.dtypes

# %%
# Break the timestamp into the calendar parts used by the plots below.
pickup_time = df['Date/Time'].dt
df['weekday'] = pickup_time.day_name()
df['day'] = pickup_time.day
df['minute'] = pickup_time.minute
df['month'] = pickup_time.month
df['hour'] = pickup_time.hour

# %%
df.dtypes

# %%
df.head()

# %%
df['Base'].unique()

# %%
df['day'].unique()

# %%
df['weekday'].unique()

# %%
# Number of rides per weekday.
weekday_counts = df['weekday'].value_counts()
px.bar(x=weekday_counts.index, y=weekday_counts.values)

# %%
plt.hist(df['hour'])

# %%
# Observation: demand peaks in the evening, when people are leaving work.

# %%
months = df['month'].unique()
for month in months:
    print(month)

# %%
# Hourly distribution of rides, one subplot per month (3 x 2 grid).
plt.figure(figsize=(40, 20))
for position, month in enumerate(months, start=1):
    plt.subplot(3, 2, position)
    df[df['month'] == month]['hour'].hist()

# %%
# Analysis of the rush in each hour, month by month (one figure per month).

# %%
for month in months:
    plt.figure(figsize=(5, 3))
    df[df['month'] == month]['hour'].hist()

# %%
# NOTE(review): this sums the hour-of-day values per month rather than
# counting rides -- confirm that this is the intended measure.
hour_sum_by_month = df.groupby('month')['hour'].sum()
trace1 = go.Bar(
    x=hour_sum_by_month.index,
    y=hour_sum_by_month,
    name='Priority',
)
iplot([trace1])

# %%
plt.figure(figsize=(10, 6))
# One bin per calendar day. The range runs to 31.5 so that rides on the 31st
# are counted; plt.hist ignores values outside `range`.
plt.hist(df['day'], bins=31, rwidth=.8, range=(0.5, 31.5))
plt.xlabel('date of the month')
plt.ylabel('Total Journeys')
plt.title('Journeys by Month Day')

# %%
# Rides per day of month, one subplot per month.
plt.figure(figsize=(20, 8))
for position, month in enumerate(months, start=1):
    plt.subplot(3, 2, position)
    month_rides = df[df['month'] == month]
    plt.hist(month_rides['day'])
    plt.xlabel('days in month')
    plt.ylabel('total rides')

# %%
sns.set_style(style='whitegrid')
# Plotted from `df`: no `lat_df` frame is defined anywhere in this file.
# NOTE(review): confirm that no pre-filtered frame was intended here.
sns.pointplot(x="hour", y="Lat", data=df)

# %%
ax = sns.pointplot(x="hour", y="Lat", hue="weekday", data=df)
ax.set_title('hoursoffday vs latiitide of passenger')

# %%
# Analyse which base is most popular in each month.

# %%
df.head()

# %%
df['Base'].head()

# %%
rides_per_base_month = df.groupby(['Base', 'month'])['Date/Time'].count()
rides_per_base_month

# %%
# After reset_index the 'Date/Time' column holds the ride count per group.
base = rides_per_base_month.reset_index()
base

# %%
plt.figure(figsize=(10, 6))
sns.lineplot(x='month', y='Date/Time', hue='Base', data=base)

# %%
# 2. Cross analysis
# In this section we visualise:
#   1. Heatmap by hour and weekday.
#   2. Heatmap by hour and day.
#   3. Heatmap by month and day.
#   4. Heatmap by month and weekday.

# %%
# The simplest way to build the pivot table: group by two columns and count
# the rows in each group, e.g.
#   df.groupby(['weekday', 'hour']).apply(lambda x: len(x))
# then call unstack(), so that "weekday" becomes the rows and "hour" the
# columns.


# %%
def count_rows(rows):
    """Return the number of rows in *rows* (any object supporting len())."""
    return len(rows)


# %%
by_cross = df.groupby(['weekday', 'hour']).apply(count_rows)
by_cross

# %%
pivot = by_cross.unstack()
pivot

# %%
# Draw the pivot table as a heatmap so that it is easy to read.

# %%
plt.figure(figsize=(10, 6))
sns.heatmap(pivot, annot=False)

# %%
df.head()


# %%
def heatmap(col1, col2):
    """Draw a heatmap of ride counts for every (col1, col2) combination.

    Args:
        col1: Column of the module-level ``df`` used for the heatmap rows.
        col2: Column of the module-level ``df`` used for the heatmap columns.

    Returns:
        The Axes object returned by ``sns.heatmap``.
    """
    # size() gives the number of rows in each group, i.e. len(group).
    pivot = df.groupby([col1, col2]).size().unstack()
    plt.figure(figsize=(10, 6))
    return sns.heatmap(pivot, annot=False)


# %%
# Validate the analysis above through heatmaps.
heatmap('day', 'hour')

# %%
heatmap('day', 'month')

# %%
# Analysing the results:
# The number of trips increases each month; from April to September 2014
# Uber was in a process of continuous improvement.

# %%
df[df['month'] == 4]

# %%
heatmap('weekday', 'month')

# %%
# Scatter of every pickup location, cropped to the New York City area.
plt.figure(figsize=(10, 6))
plt.plot(df['Lon'], df['Lat'], 'r+', ms=0.5)
plt.xlim(-74.2, -73.7)
plt.ylim(40.6, 41)

# %%
# Several hot spots are visible. Midtown Manhattan is clearly a huge bright
# spot, extending from Midtown down to Lower Manhattan, followed by Upper
# Manhattan and Brooklyn Heights.

# %%
# Spatial analysis: use a heatmap to get a clear picture of the rush on
# Sunday (weekend).

# %%
df.head()

# %%
df_out = df[df['weekday'] == 'Sunday']
df_out.head()

# %%
# One row per distinct coordinate: Lat, Lon and the number of pickups there
# (the count lands in the 'weekday' column).
sunday_counts = df_out.groupby(['Lat', 'Lon'])['weekday'].count().reset_index()
sunday_counts

# %%
basemap = folium.Map()

# %%
HeatMap(sunday_counts, zoom=20, radius=15).add_to(basemap)
basemap


# %%
# Wrap the steps above in a function that works for any given day.

# %%
def plot(df, day):
    """Return a folium map with a pickup heatmap for a single weekday.

    NOTE: this name shadows ``plot`` imported from ``plotly.offline`` at the
    top of the file.

    Args:
        df: DataFrame with 'weekday', 'Lat' and 'Lon' columns.
        day: Weekday name to keep, e.g. 'Sunday'.

    Returns:
        A new ``folium.Map`` carrying one HeatMap layer for *day*.
    """
    day_rides = df[df['weekday'] == day]
    # One row per distinct coordinate: Lat, Lon, number of pickups there.
    counts = day_rides.groupby(['Lat', 'Lon'])['weekday'].count().reset_index()
    # A fresh map per call, so layers from earlier calls do not pile up on a
    # shared map.
    day_map = folium.Map()
    HeatMap(counts, zoom=20, radius=15).add_to(day_map)
    return day_map


# %%
plot(df, 'Sunday')